package com.cn.webspider;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic {@link PageProcessor} that walks the list pages of www.0818tuan.com,
 * queues every linked detail page, and prints the title, first content paragraph
 * and publish-time node of each detail page to standard output.
 *
 * <p>Detail requests are tagged by appending {@code ?detail} to their URL so that
 * {@link #process(Page)} can tell them apart from list pages.
 */
public class Webmagic implements PageProcessor {
    /** Scheme and host prepended to the hrefs extracted from the pages. */
    private static final String BASE_URL = "http://www.0818tuan.com";

    /** Suffix appended to detail links; its presence in a URL selects the detail branch. */
    private static final String DETAIL_MARKER = "?detail";

    /**
     * Tags stripped from the serialized content paragraph. The original pattern listed
     * {@code </b>} twice; the duplicate alternative is dropped, which matches the same text.
     */
    private static final String STRIPPED_TAGS_REGEX = "<br>|<p>|<a>|</b>";

    private final Site site = Site.me().setRetryTimes(1).setSleepTime(100);

    /** 1-based counter of list pages processed so far (printed for progress). */
    int i = 1;

    /** Item counter; not read by the current code but kept for package-level compatibility. */
    int d = 1;

    /**
     * Handles one downloaded page: prints its content when the URL contains
     * {@code "detail"}, otherwise treats it as a list page and queues follow-up requests.
     *
     * @param page the downloaded page supplied by the spider
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().get();
        String html = page.getHtml().xpath("html").get();
        if (url == null || html == null) {
            // Nothing to parse; avoid handing null to Jsoup.
            return;
        }
        Document document = Jsoup.parse(html);
        if (url.contains("detail")) {
            printDetail(document);
        } else {
            queueListPage(page, document);
        }
    }

    /** Prints title, first content paragraph and publish time of a detail page. */
    private void printDetail(Document document) {
        System.out.println("标题：" + document.title()
                + "-------内容：" + firstParagraph(document)
                + "------发布时间：" + publishTime(document));
    }

    /** Returns the first {@code .post-content p} element with selected tags stripped, or "" if absent. */
    private static String firstParagraph(Document document) {
        Element paragraph = document.select(".post-content p").first();
        if (paragraph == null) {
            // first() yields null on an empty selection; the original code threw here.
            return "";
        }
        return paragraph.toString().replaceAll(STRIPPED_TAGS_REGEX, "");
    }

    /**
     * Returns the third child node of the second {@code .panel-body .text-center} element,
     * or "" when the page does not have that structure.
     */
    private static String publishTime(Document document) {
        Elements blocks = document.select(".panel-body .text-center");
        if (blocks.size() < 2) {
            return "";
        }
        Element block = blocks.get(1);
        if (block.childNodeSize() < 3) {
            return "";
        }
        return block.childNode(2).toString();
    }

    /** Queues every detail link of a list page, then the follow-up list page if one is linked. */
    private void queueListPage(Page page, Document document) {
        for (Element link : document.select(".col-md-8 .panel .list-group a")) {
            String href = link.attr("href");
            if (!href.isEmpty()) {
                // Skip anchors without an href; they would produce a bogus request to the site root.
                page.addTargetRequest(BASE_URL + href + DETAIL_MARKER);
            }
        }

        System.out.println("第---------------------------" + i++ + "页");

        // The second-to-last pager link is used as the next page to crawl
        // (assumes the site's pager layout — verify if the markup changes).
        Elements pagerLinks = document.select(".pager a");
        if (pagerLinks.size() < 2) {
            return;
        }
        String nextHref = pagerLinks.get(pagerLinks.size() - 2).attr("href");
        if (!nextHref.isEmpty()) {
            page.addTargetRequest(BASE_URL + nextHref);
        }
    }

    /**
     * Returns the crawl configuration used by the spider.
     *
     * @return the site settings (one retry, 100 ms sleep between requests)
     */
    @Override
    public Site getSite() {
        return site;
    }

    /** Runs the spider synchronously, starting from the first list page. */
    @Test
    public void te1() {
        System.out.println("--------------------------------开始--------------------------------");
        Spider.create(new Webmagic())
                .addUrl("http://www.0818tuan.com/list-1-0.html")
                .run();
        System.out.println("--------------------------------结束--------------------------------");
    }
}
